1 Setup

Set the working dir

setwd("/mnt/picea/projects/aspseq/emellerowicz/Cazymes/Selected-Tissues")
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(hyperSpec))
source("~/Git/UPSCb/src/R/expressionSpecificityUtility.R")
source("~/Git/UPSCb/src/R/percentile.R")
hpal <- colorRampPalette(colors = c("blue","white","red"))(100)
dat <- read.delim2("~/Git/UPSCb/projects/aspen-cazymes/doc/Cazy-selected.txt")

2 Hierarchical Clustering

sdat <- t(scale(t(as.matrix(dat[,4:26]))))

Remove non expressed cazymes

sel <- !rowSums(is.na(sdat)) == 23
hc.s <- hclust(d=pearson.dist(t(sdat[sel,])),method = "ward.D2")
plot(hc.s)

hc.g <- hclust(d=pearson.dist(sdat[sel,]),method = "ward.D2")
plot(hc.g,labels=dat$CAZY[sel])

3 Heatmap

heatmap.2(sdat[sel,],
          distfun = pearson.dist,
          hclustfun = function(X){hclust(X,method = "ward.D2")},
          labRow=dat$CAZY[sel],trace="none",
          las=2,col=hpal,
          key=TRUE,main="Tissue expression",
          cexCol = 0.8,cexRow = 0.4,srtCol = 45,margins = c(10.1,4.1))

pdf("cazymes-all-tissues.pdf",width = 110,height=110)
heatmap.2(sdat[sel,],
          distfun = pearson.dist,
          hclustfun = function(X){hclust(X,method = "ward.D2")},
          labRow=dat$ID[sel],trace="none",
          las=2,col=hpal,
          key=FALSE,main="Tissue expression",
          cexCol = 0.8,cexRow = 0.4,srtCol = 45,margins = c(10.1,4.1))
dev.off()
## png 
##   2

Re-create the hierarchical clustering

f.hclust <- hclust(pearson.dist(sdat[sel,]),"ward.D2")
ct <- cutree(f.hclust,k=50)

check the cluster sizes

range(table(ct))
## [1]  4 56

Per cluster, print the heatmap and export the gene list

sapply(1:50,function(i){
  sel3 <- ct==i
  res <- heatmap.2(sdat[sel,][sel3,],
                   distfun = pearson.dist,
                   hclustfun = function(X){hclust(X,method = "ward.D2")},
                   labRow=dat$ID[sel][sel3],trace="none",
                   las=2,col=hpal,
                   key=FALSE,main=sprintf("Tissue expression cluster %02d",i),
                   cexCol = 0.8,cexRow = 0.8,srtCol = 45,margins = c(12.1,8.1))
  pdf(sprintf("cazymes-all-tissues-%02d.pdf",i),width = 12,height=8)
  heatmap.2(sdat[sel,][sel3,],
            distfun = pearson.dist,
            hclustfun = function(X){hclust(X,method = "ward.D2")},
            labRow=dat$ID[sel][sel3],trace="none",
            las=2,col=hpal,
            key=FALSE,main=sprintf("Tissue expression cluster %02d",i),
            cexCol = 0.8,cexRow = 0.8,srtCol = 45,margins = c(12.1,8.1))
  dev.off()
  write(rev(dat$ID[sel][sel3][res$rowInd]),file = sprintf("cazymes-all-tissues-cluster%02d.txt",i))
})

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## NULL
## 
## [[11]]
## NULL
## 
## [[12]]
## NULL
## 
## [[13]]
## NULL
## 
## [[14]]
## NULL
## 
## [[15]]
## NULL
## 
## [[16]]
## NULL
## 
## [[17]]
## NULL
## 
## [[18]]
## NULL
## 
## [[19]]
## NULL
## 
## [[20]]
## NULL
## 
## [[21]]
## NULL
## 
## [[22]]
## NULL
## 
## [[23]]
## NULL
## 
## [[24]]
## NULL
## 
## [[25]]
## NULL
## 
## [[26]]
## NULL
## 
## [[27]]
## NULL
## 
## [[28]]
## NULL
## 
## [[29]]
## NULL
## 
## [[30]]
## NULL
## 
## [[31]]
## NULL
## 
## [[32]]
## NULL
## 
## [[33]]
## NULL
## 
## [[34]]
## NULL
## 
## [[35]]
## NULL
## 
## [[36]]
## NULL
## 
## [[37]]
## NULL
## 
## [[38]]
## NULL
## 
## [[39]]
## NULL
## 
## [[40]]
## NULL
## 
## [[41]]
## NULL
## 
## [[42]]
## NULL
## 
## [[43]]
## NULL
## 
## [[44]]
## NULL
## 
## [[45]]
## NULL
## 
## [[46]]
## NULL
## 
## [[47]]
## NULL
## 
## [[48]]
## NULL
## 
## [[49]]
## NULL
## 
## [[50]]
## NULL

4 Tissue specificity

samples.cluster <- cutree(hc.s,k=5)
sort(samples.cluster)
##                Buds_Dormant_SLU.May                 Flowers_Dormant_SLU 
##                                   1                                   1 
##               Flowers_Expanding_SLU         Leaves_Freshly_Expanded_SLU 
##                                   1                                   1 
##          Leaves_Young_Expanding_SLU            Suckers_Whole_Sucker_SLU 
##                                   1                                   1 
##        Buds_Pre_chilling_SLU_July30                  Petiole_Mature_SLU 
##                                   2                                   2 
##            Roots_Control_Greenhouse            Roots_Drought_Greenhouse 
##                                   2                                   2 
##               Twigs_Non_Girdled_SLU                Flowers_Expanded_SLU 
##                                   2                                   3 
##                    Seeds_Mature_SLU           Leaves_Beetle_Damaged_Lab 
##                                   3                                   4 
##           Leaves_Control_Greenhouse           Leaves_Drought_Greenhouse 
##                                   4                                   4 
##            Leaves_Mature_2_SLU.Aug3            Leaves_Mature_SLU.July30 
##                                   4                                   4 
## Leaves_Mechanical_Damage_Greenhouse         Leaves_Undamaged_Greenhouse 
##                                   4                                   4 
##                             Cambium                              Phloem 
##                                   5                                   5 
##                               Xylem 
##                                   5
es <- expressionSpecificity(as.matrix(dat[,4:26]),
                            tissues = as.character(factor(samples.cluster)),
                            mode = "local",output = "complete")
## Calculating tissue specificity
exp.peak <- as.character(1:5)[sapply(apply(es[,as.character(1:5)] == es[,"maxn"],1,which),"[",1)]
ord <- order(exp.peak,(1-es[,"score"]))

s.ord <- order(samples.cluster)

# heatmap.2(sdat[ord,s.ord],
#           #distfun = pearson.dist,
#           #hclustfun = function(X){hclust(X,method = "ward.D2")},
#           dendrogram = "none",
#           Colv = FALSE, Rowv = FALSE,
#           labRow=dat$CAZY[ord],trace="none",
#           colRow = names(samples.cluster)[s.ord],
#           las=2,col=hpal,
#           key=FALSE,main="Tissue expression",
#           cexCol = 0.8,cexRow = 0.4,srtCol = 45,margins = c(10.1,4.1))
tissues <- names(samples.cluster)[c(4,14,11,23,19,21)]

Remove cazymes with insufficient variance

percentile(rowSds(sdat[sel,match(tissues,colnames(sdat))]))
##           0%           1%           2%           3%           4% 
## 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 3.040471e-17 
##           5%           6%           7%           8%           9% 
## 3.040471e-17 6.080942e-17 2.407618e-01 3.790760e-01 4.412825e-01 
##          10%          11%          12%          13%          14% 
## 4.825874e-01 5.224530e-01 5.517175e-01 5.683728e-01 5.891265e-01 
##          15%          16%          17%          18%          19% 
## 6.271915e-01 6.518213e-01 6.707525e-01 6.958483e-01 7.093240e-01 
##          20%          21%          22%          23%          24% 
## 7.235366e-01 7.308492e-01 7.480326e-01 7.592478e-01 7.775784e-01 
##          25%          26%          27%          28%          29% 
## 7.846098e-01 7.990258e-01 8.073055e-01 8.178325e-01 8.282935e-01 
##          30%          31%          32%          33%          34% 
## 8.445277e-01 8.531262e-01 8.604260e-01 8.733740e-01 8.790009e-01 
##          35%          36%          37%          38%          39% 
## 8.866508e-01 8.950814e-01 9.015552e-01 9.098341e-01 9.158462e-01 
##          40%          41%          42%          43%          44% 
## 9.264510e-01 9.374317e-01 9.456656e-01 9.494436e-01 9.529627e-01 
##          45%          46%          47%          48%          49% 
## 9.601532e-01 9.680697e-01 9.759768e-01 9.832499e-01 9.913180e-01 
##          50%          51%          52%          53%          54% 
## 1.001037e+00 1.005045e+00 1.009298e+00 1.017451e+00 1.028613e+00 
##          55%          56%          57%          58%          59% 
## 1.034414e+00 1.044330e+00 1.054655e+00 1.063295e+00 1.068333e+00 
##          60%          61%          62%          63%          64% 
## 1.076516e+00 1.079805e+00 1.085405e+00 1.098573e+00 1.108120e+00 
##          65%          66%          67%          68%          69% 
## 1.116825e+00 1.127411e+00 1.134946e+00 1.138400e+00 1.143678e+00 
##          70%          71%          72%          73%          74% 
## 1.147028e+00 1.153913e+00 1.163572e+00 1.170221e+00 1.178502e+00 
##          75%          76%          77%          78%          79% 
## 1.186533e+00 1.194572e+00 1.204599e+00 1.209675e+00 1.223638e+00 
##          80%          81%          82%          83%          84% 
## 1.237802e+00 1.245941e+00 1.259554e+00 1.272969e+00 1.288225e+00 
##          85%          86%          87%          88%          89% 
## 1.304631e+00 1.322101e+00 1.334147e+00 1.344490e+00 1.363526e+00 
##          90%          91%          92%          93%          94% 
## 1.388256e+00 1.410480e+00 1.440648e+00 1.467026e+00 1.491709e+00 
##          95%          96%          97%          98%          99% 
## 1.516316e+00 1.530270e+00 1.564936e+00 1.629950e+00 1.721797e+00 
##         100% 
## 1.957890e+00
sel2 <- rowSds(sdat[sel,match(tissues,colnames(sdat))]) >= 0.05

Plot for the selected tissues and remaining cazymes

heatmap.2(sdat[sel,match(tissues,colnames(sdat))][sel2,],
          distfun = pearson.dist,
          hclustfun = function(X){hclust(X,method = "ward.D2")},
          labRow=dat$CAZY[sel][sel2],trace="none",
          las=2,col=hpal,
          key=FALSE,main="Tissue expression",
          cexCol = 0.8,cexRow = 0.4,srtCol = 45,margins = c(10.1,4.1))

pdf("cazymes-selected-tissues-all.pdf",width = 110,height=110)
heatmap.2(sdat[sel,match(tissues,colnames(sdat))][sel2,],
          distfun = pearson.dist,
          hclustfun = function(X){hclust(X,method = "ward.D2")},
          labRow=dat$ID[sel][sel2],trace="none",
          las=2,col=hpal,
          key=FALSE,main="Tissue expression",
          cexCol = 0.8,cexRow = 0.4,srtCol = 45,margins = c(10.1,4.1))
dev.off()
## png 
##   2

Re-create the hierarchical clustering

f.hclust <- hclust(pearson.dist(sdat[sel,match(tissues,colnames(sdat))][sel2,]),"ward.D2")
ct <- cutree(f.hclust,k=40)

check the cluster sizes

range(table(ct))
## [1]  4 46

Per cluster, print the heatmap and export the gene list

sapply(1:40,function(i){
  sel3 <- ct==i
  res <- heatmap.2(sdat[sel,match(tissues,colnames(sdat))][sel2,][sel3,],
                   distfun = pearson.dist,
                   hclustfun = function(X){hclust(X,method = "ward.D2")},
                   labRow=dat$ID[sel][sel2][sel3],trace="none",
                   las=2,col=hpal,
                   key=FALSE,main=sprintf("Tissue expression cluster %02d",i),
                   cexCol = 0.8,cexRow = 0.8,srtCol = 45,margins = c(12.1,8.1))
  pdf(sprintf("cazymes-selected-tissues-%02d.pdf",i),width = 12,height=8)
  heatmap.2(sdat[sel,match(tissues,colnames(sdat))][sel2,][sel3,],
                   distfun = pearson.dist,
                   hclustfun = function(X){hclust(X,method = "ward.D2")},
                   labRow=dat$ID[sel][sel2][sel3],trace="none",
                   las=2,col=hpal,
                   key=FALSE,main=sprintf("Tissue expression cluster %02d",i),
                   cexCol = 0.8,cexRow = 0.8,srtCol = 45,margins = c(12.1,8.1))
  dev.off()
  write(rev(dat$ID[sel][sel2][sel3][res$rowInd]),file = sprintf("cluster%02d.txt",i))
})

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## NULL
## 
## [[11]]
## NULL
## 
## [[12]]
## NULL
## 
## [[13]]
## NULL
## 
## [[14]]
## NULL
## 
## [[15]]
## NULL
## 
## [[16]]
## NULL
## 
## [[17]]
## NULL
## 
## [[18]]
## NULL
## 
## [[19]]
## NULL
## 
## [[20]]
## NULL
## 
## [[21]]
## NULL
## 
## [[22]]
## NULL
## 
## [[23]]
## NULL
## 
## [[24]]
## NULL
## 
## [[25]]
## NULL
## 
## [[26]]
## NULL
## 
## [[27]]
## NULL
## 
## [[28]]
## NULL
## 
## [[29]]
## NULL
## 
## [[30]]
## NULL
## 
## [[31]]
## NULL
## 
## [[32]]
## NULL
## 
## [[33]]
## NULL
## 
## [[34]]
## NULL
## 
## [[35]]
## NULL
## 
## [[36]]
## NULL
## 
## [[37]]
## NULL
## 
## [[38]]
## NULL
## 
## [[39]]
## NULL
## 
## [[40]]
## NULL

Session Info

## R version 3.4.3 (2017-11-30)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.4 LTS
## 
## Matrix products: default
## BLAS: /usr/lib/openblas-base/libblas.so.3
## LAPACK: /usr/lib/libopenblasp-r0.2.18.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] matrixStats_0.53.1      hyperSpec_0.99-20171005 ggplot2_2.2.1          
## [4] lattice_0.20-35         gplots_3.0.1           
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.16        knitr_1.20          magrittr_1.5       
##  [4] munsell_0.4.3       colorspace_1.3-2    R6_2.2.2           
##  [7] rlang_0.2.0         plyr_1.8.4          stringr_1.3.0      
## [10] caTools_1.17.1      tools_3.4.3         gtable_0.2.0       
## [13] KernSmooth_2.23-15  latticeExtra_0.6-28 htmltools_0.3.6    
## [16] gtools_3.5.0        lazyeval_0.2.1      yaml_2.1.19        
## [19] rprojroot_1.3-2     digest_0.6.15       tibble_1.4.2       
## [22] RColorBrewer_1.1-2  bitops_1.0-6        testthat_2.0.0     
## [25] evaluate_0.10.1     rmarkdown_1.9       gdata_2.18.0       
## [28] stringi_1.2.2       pillar_1.2.2        compiler_3.4.3     
## [31] scales_0.5.0        backports_1.1.2